{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/integrations/haystack/nhs-search/notebooks/01_test_pipeline.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/integrations/haystack/nhs-search/notebooks/01_test_pipeline.ipynb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/\n",
      "ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n",
      "INFO - haystack.document_stores.pinecone -  Index statistics: name: haystack-nhs-jul, embedding dimensions: 768, record count: 0\n"
     ]
    }
   ],
   "source": [
    "from haystack.document_stores import PineconeDocumentStore\n",
    "\n",
    "API_KEY = \"<<YOUR_API_KEY>>\"\n",
    "INDEX_NAME = \"haystack-nhs-jul\"\n",
    "\n",
    "document_store = PineconeDocumentStore(\n",
    "    api_key=API_KEY,\n",
    "    index=INDEX_NAME,\n",
    "    embedding_dim=768\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8521, 8521)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "document_store.get_document_count(), document_store.get_embedding_count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO - haystack.modeling.utils -  Using devices: CPU\n",
      "INFO - haystack.modeling.utils -  Number of GPUs: 0\n",
      "INFO - haystack.retriever.dense -  Init retriever using embeddings of model sentence-transformers/multi-qa-mpnet-base-dot-v1\n"
     ]
    }
   ],
   "source": [
    "from haystack.retriever.dense import EmbeddingRetriever\n",
    "\n",
    "retriever = EmbeddingRetriever(\n",
    "    document_store=document_store,\n",
    "    embedding_model='sentence-transformers/multi-qa-mpnet-base-dot-v1',\n",
    "    model_format=\"sentence_transformers\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO - haystack.modeling.utils -  Using devices: CPU\n",
      "INFO - haystack.modeling.utils -  Number of GPUs: 0\n",
      "INFO - haystack.modeling.model.language_model -  LOADING MODEL\n",
      "INFO - haystack.modeling.model.language_model -  =============\n",
      "INFO - haystack.modeling.model.language_model -  Could not find deepset/roberta-base-squad2-distilled locally.\n",
      "INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...\n",
      "INFO - haystack.modeling.model.language_model -  Loaded deepset/roberta-base-squad2-distilled\n",
      "INFO - haystack.modeling.utils -  Using devices: CPU\n",
      "INFO - haystack.modeling.utils -  Number of GPUs: 0\n",
      "INFO - haystack.modeling.infer -  Got ya 9 parallel workers to do inference ...\n",
      "INFO - haystack.modeling.infer -   0     0     0     0     0     0     0     0     0  \n",
      "INFO - haystack.modeling.infer -  /w\\   /w\\   /w\\   /w\\   /w\\   /w\\   /w\\   /|\\  /w\\ \n",
      "INFO - haystack.modeling.infer -  /'\\   / \\   /'\\   /'\\   / \\   / \\   /'\\   /'\\   /'\\ \n"
     ]
    }
   ],
   "source": [
    "from haystack.nodes import FARMReader\n",
    "from haystack.pipelines import ExtractiveQAPipeline\n",
    "\n",
    "reader = FARMReader(\n",
    "    model_name_or_path='deepset/roberta-base-squad2-distilled',\n",
    "    use_gpu=True\n",
    ")\n",
    "pipe = ExtractiveQAPipeline(reader, retriever)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can begin asking questions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ffea2aea7d8743b4a2eeb63f74ed5f5f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/\n",
      "INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/\n",
      "INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/\n",
      "INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/\n",
      "INFO - haystack.modeling.model.optimization -  apex not found, won't use it. See https://nvidia.github.io/apex/\n",
      "ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n",
      "ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n",
      "ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n",
      "ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n",
      "ERROR - root -  Failed to import 'magic' (from 'python-magic' and 'python-magic-bin' on Windows). FileTypeClassifier will not perform mimetype detection on extensionless files. Please make sure the necessary OS libraries are installed if you need this functionality.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s]/Users/jamesbriggs/opt/anaconda3/envs/ml/lib/python3.9/site-packages/haystack/modeling/model/prediction_head.py:483: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
      "  start_indices = flat_sorted_indices // max_seq_len\n",
      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.88 Batches/s]\n",
      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.03 Batches/s]\n",
      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.26 Batches/s]\n",
      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.73 Batches/s]\n",
      "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.27 Batches/s]\n"
     ]
    }
   ],
   "source": [
    "prediction = pipe.run(\n",
    "    query=\"Who is affected by pre-eclampsia?\",\n",
    "    params={\n",
    "        \"Retriever\": {\"top_k\": 5},\n",
    "        \"Reader\": {\"top_k\": 2}\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Query: Who is affected by pre-eclampsia?\n",
      "Answers:\n",
      "[   {   'answer': 'pregnant women',\n",
      "        'context': 'atment Complications Pre-eclampsia is a condition that '\n",
      "                   'affects some pregnant women, usually during the second '\n",
      "                   'half of pregnancy (from 20 weeks) or soo'},\n",
      "    {   'answer': 'mother and baby',\n",
      "        'context': ' are mild, the condition can lead to serious complications '\n",
      "                   \"for both mother and baby if it's not monitored and \"\n",
      "                   'treated.  The earlier pre-eclampsia is d'}]\n"
     ]
    }
   ],
   "source": [
    "from haystack.utils import print_answers\n",
    "print_answers(prediction, details=\"minimum\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see the top answer seems to be correct. To extract each component here rather than print with the built in method, we can do this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'atment Complications Pre-eclampsia is a condition that affects some pregnant women, usually during the second half of pregnancy (from 20 weeks) or soo'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction['answers'][0].context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(68, 82)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "start = prediction['answers'][0].offsets_in_context[0].start\n",
    "end = prediction['answers'][0].offsets_in_context[0].end\n",
    "\n",
    "start, end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'pregnant women'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction['answers'][0].context[start:end]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Answer {'answer': 'pregnant women', 'type': 'extractive', 'score': 0.8104832470417023, 'context': 'atment Complications Pre-eclampsia is a condition that affects some pregnant women, usually during the second half of pregnancy (from 20 weeks) or soo', 'offsets_in_document': [{'start': 140, 'end': 154}], 'offsets_in_context': [{'start': 68, 'end': 82}], 'document_id': '3bc401b213c2720c83ee9bddb0e769b8', 'meta': {'url': 'www.nhs.uk/conditions/pre-eclampsia'}}>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction['answers'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8104832470417023"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction['answers'][0].score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'www.nhs.uk/conditions/pre-eclampsia'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction['answers'][0].meta['url']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "5fe10bf018ef3e697f9035d60bf60847932a12bface18908407fd371fe880db9"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
